Read files and plot data
GC content was analyzed with Picard (see Evernote ELN).
#load all the files as a list of dataframes
gc_picard=c("ctrl_gc_bias_coli", "ctrl_gc_bias_subtilis","ctrl_gc_bias_PAE","ctrl_gc_bias_kocuria",
"ctrl2_gc_bias_coli", "ctrl2_gc_bias_subtilis","ctrl2_gc_bias_PAE","ctrl2_gc_bias_kocuria",
"N4_gc_bias_coli", "N4_gc_bias_subtilis","N4_gc_bias_PAE","N4_gc_bias_kocuria",
"D3_gc_bias_coli", "D3_gc_bias_subtilis","D3_gc_bias_PAE","D3_gc_bias_kocuria",
"C3_gc_bias_coli", "C3_gc_bias_subtilis","C3_gc_bias_PAE","C3_gc_bias_kocuria",
"N1_gc_bias_coli", "N1_gc_bias_subtilis","N1_gc_bias_PAE","N1_gc_bias_kocuria",
"F2_gc_bias_coli", "F2_gc_bias_subtilis","F2_gc_bias_PAE","F2_gc_bias_kocuria",
"N2_gc_bias_coli", "N2_gc_bias_subtilis","N2_gc_bias_PAE","N2_gc_bias_kocuria",
"B2_gc_bias_coli", "B2_gc_bias_subtilis","B2_gc_bias_PAE","B2_gc_bias_kocuria",
"N6_gc_bias_coli", "N6_gc_bias_subtilis","N6_gc_bias_PAE","N6_gc_bias_kocuria",
"A2_gc_bias_coli", "A2_gc_bias_subtilis","A2_gc_bias_PAE","A2_gc_bias_kocuria")
gc <- lapply(gc_picard, function(x) read.csv2(paste(x,".txt",sep=""), skip=6,header=TRUE,sep="\t", colClasses=c("NULL","NULL","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric","numeric")))
#plot all the samples using a loop
library(ggplot2)
library(ggpubr)
cor_matrix <- data.frame(44,3)
colors <- c("Mean quality"="grey","Normalized coverage"="firebrick","GC"="steelblue")
samples <- c("NA","NA2","RepliG","RepliG2","TruePrime","piPolB","piPolB+D","piMDA","piMDA2","piMDA+D","piMDA+D2")
templates <- c("E. coli","B. subtilis 110NA", "P. aeruginosa PAER4","K. rhizophila")
genomas <- merge(templates, samples ,all=TRUE)
plot_list <- list()
for (i in 1:length(gc)){
p <- ggplot(data=gc[[i]],aes(x=gc[[i]]$GC,y=gc[[i]]$WINDOWS)) +
geom_bar(stat="identity",fill="steelblue")+
geom_line(aes(y=gc[[i]]$MEAN_BASE_QUALITY*7000,color="Mean quality"))+
geom_point(aes(y=gc[[i]]$NORMALIZED_COVERAGE*200000,color="Normalized coverage"))+
scale_y_continuous("Frequency at GC", sec.axis=sec_axis(~./200000,name="Normalized coverage"))+
scale_color_manual(name="",values=colors)+
xlab("GC content") +
ggtitle(paste(genomas[i,1],"de la muestra",genomas[i,2])) +
theme_bw(base_size=14)
print(p)
}
Correlations 1: GC content vs. Normalized Coverage
cor_matrix <- data.frame(44,3)
for (i in 1:length(gc)){
gc[[i]] <- gc[[i]][gc[[i]]$WINDOWS>5,] #remove 0 values
cor_matrix[i,1] <- gc_picard[i]
tmp <- cor.test(gc[[i]]$GC,gc[[i]]$NORMALIZED_COVERAGE)
cor_matrix[i,2] <-tmp$estimate
cor_matrix[i,3] <-tmp$p.value
}
corr_frame <- data.frame(matrix(ncol = 11, nrow = 4))
#split the data to genome vs sample
#GC vs Normalized Coverage
for (i in 1:ncol(corr_frame)){
if (i==1){
corr_frame[1:4,i] <- cor_matrix[1:4,2]
}else{
corr_frame[1:4,i] <- cor_matrix[(4*i)-3:4*i,2]
}
}
corr_frame <- sapply(corr_frame, as.numeric)
colnames(corr_frame) <- c("NA","NA2","RepliG","RepliG2","TruePrime","piPolB","piPolB+D","piMDA","piMDA2","piMDA+D","piMDA+D2")
row.names(corr_frame) <- c("E. coli","B. subtilis 110NA", "P. aeruginosa PAER4","K. rhizophila")
#plot
library(corrplot)
## corrplot 0.92 loaded
corrplot(as.matrix(corr_frame), tl.col="black", addCoef.col = 1, number.cex = 1)
#include the p-values
corr_frame2 <- data.frame(matrix(ncol = 11, nrow = 4))
for (i in 1:ncol(corr_frame2)){
if (i==1){
corr_frame2[1:4,i] <- cor_matrix[1:4,3]
}else{
corr_frame2[1:4,i] <- cor_matrix[(4*i)-3:4*i,3]
}
}
corr_frame2 <- sapply(corr_frame2, as.numeric)
colnames(corr_frame2) <- colnames(corr_frame)
row.names(corr_frame2) <- row.names(corr_frame)
#plot
corrplot(as.matrix(corr_frame), tl.col="black", p.mat=corr_frame2, addCoef.col = 1,
number.cex = 1)
library(gdata)
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
## The following object is masked from 'package:stats':
##
## nobs
## The following object is masked from 'package:utils':
##
## object.size
## The following object is masked from 'package:base':
##
## startsWith
#multiple correlation
cor_matrix2 <- gc[[1]]$NORMALIZED_COVERAGE
for (i in 2:length(gc)){
gc[[i]] <- gc[[i]][gc[[i]]$WINDOWS!=0,] #remove 0 values
cor_matrix2 <-cbindX(as.data.frame(cor_matrix2),as.data.frame(gc[[i]]$NORMALIZED_COVERAGE))
}
colnames(cor_matrix2) <-paste0(genomas[,2],"-",genomas[,1])
testRes = cor.mtest(cor_matrix2, conf.level = 0.95)
corrplot(cor(cor_matrix2, use="complete.obs"), tl.col="black", p.mat = testRes$p)
Correlations 1: Windows vs. Normalized Coverage
cor_matrix <- data.frame(44,3)
for (i in 1:length(gc)){
gc[[i]] <- gc[[i]][gc[[i]]$WINDOWS>5,] #remove 0 values
cor_matrix[i,1] <- gc_picard[i]
tmp <- cor.test(gc[[i]]$WINDOWS,gc[[i]]$NORMALIZED_COVERAGE)
cor_matrix[i,2] <-tmp$estimate
cor_matrix[i,3] <-tmp$p.value
}
corr_frame <- data.frame(matrix(ncol = 11, nrow = 4))
#split the data to genome vs sample
#GC vs Normalized Coverage
for (i in 1:ncol(corr_frame)){
if (i==1){
corr_frame[1:4,i] <- cor_matrix[1:4,2]
}else{
corr_frame[1:4,i] <- cor_matrix[(4*i)-3:4*i,2]
}
}
corr_frame <- sapply(corr_frame, as.numeric)
colnames(corr_frame) <- c("NA","NA2","RepliG","RepliG2","TruePrime","piPolB","piPolB+D","piMDA","piMDA2","piMDA+D","piMDA+D2")
row.names(corr_frame) <- c("E. coli","B. subtilis 110NA", "P. aeruginosa PAER4","K. rhizophila")
#plot
library(corrplot)
corrplot(as.matrix(corr_frame), tl.col="black", addCoef.col = 1, number.cex = 1)
#include the p-values
corr_frame2 <- data.frame(matrix(ncol = 11, nrow = 4))
for (i in 1:ncol(corr_frame2)){
if (i==1){
corr_frame2[1:4,i] <- cor_matrix[1:4,3]
}else{
corr_frame2[1:4,i] <- cor_matrix[(4*i)-3:4*i,3]
}
}
corr_frame2 <- sapply(corr_frame2, as.numeric)
colnames(corr_frame2) <- colnames(corr_frame)
row.names(corr_frame2) <- row.names(corr_frame)
#plot
corrplot(as.matrix(corr_frame), tl.col="black", p.mat=corr_frame2, addCoef.col = 1,
number.cex = 1)
library(gdata)
#multiple correlation
cor_matrix2 <- gc[[1]]$NORMALIZED_COVERAGE
for (i in 2:length(gc)){
gc[[i]] <- gc[[i]][gc[[i]]$WINDOWS!=0,] #remove 0 values
cor_matrix2 <-cbindX(as.data.frame(cor_matrix2),as.data.frame(gc[[i]]$NORMALIZED_COVERAGE))
}
testRes = cor.mtest(cor_matrix2, conf.level = 0.95)
colnames(cor_matrix2) <-paste0(genomas[,2],"-",genomas[,1])
corrplot(cor(cor_matrix2, use="complete.obs"),tl.col="black", p.mat = testRes$p)